From 4d9de9b9847a4703d0da11b3ab95c9d4e4201c4b Mon Sep 17 00:00:00 2001 From: Boyuan Yang <073plan@gmail.com> Date: Mon, 9 Oct 2017 21:42:06 +0800 Subject: [PATCH] New upstream version 1.0.5 --- CMakeLists.txt | 21 ++++++---- NEWS.md | 11 +++++ README.md | 20 +++++---- binding.gyp | 1 - data/CMakeLists.txt | 2 + data/dictionary/STCharacters.txt | 12 ++++++ data/dictionary/STPhrases.txt | 66 +++++++++++++++--------------- data/dictionary/TWPhrasesOther.txt | 1 + data/dictionary/TWVariants.txt | 5 +++ data/scheme/st_multi.txt | 2 +- doc/CMakeLists.txt | 2 +- node/binding.cc | 45 +++++++++++++++++++- node/demo.js | 15 +++++-- node/dict.js | 6 +++ node/dicts.gypi | 66 +++++++++++++++--------------- node/global.gypi | 2 +- node/node_binding.gypi | 13 ------ node/opencc.js | 46 +++++++++++++++++++++ node/opencc_dict.gypi | 21 ---------- package.json | 6 +-- src/BinaryDict.cpp | 30 +++++++------- src/CMakeLists.txt | 5 ++- src/Config.cpp | 8 +++- src/ConfigTest.cpp | 10 +++++ src/DartsDict.cpp | 1 - src/Dict.cpp | 4 +- src/DictConverter.cpp | 57 ++++++++++++++++++++++++++ src/DictConverter.hpp | 30 ++++++++++++++ src/DictEntry.hpp | 6 +-- src/Exception.hpp | 2 +- src/PhraseExtract.cpp | 25 ++++++----- src/PhraseExtract.hpp | 2 +- src/SimpleConverter.cpp | 11 ++--- src/UTF8StringSlice.hpp | 28 +++++++------ src/UTF8Util.hpp | 10 +++-- src/tools/DictConverter.cpp | 35 +--------------- 36 files changed, 403 insertions(+), 224 deletions(-) create mode 100644 node/dict.js delete mode 100644 node/opencc_dict.gypi create mode 100644 src/DictConverter.cpp create mode 100644 src/DictConverter.hpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 5ff33eb..2a29415 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -32,7 +32,7 @@ set (PACKAGE_URL https://github.com/BYVoid/Opencc) set (PACKAGE_BUGREPORT https://github.com/BYVoid/Opencc/issues) set (OPENCC_VERSION_MAJOR 1) set (OPENCC_VERSION_MINOR 0) -set (OPENCC_VERSION_REVISION 4) +set (OPENCC_VERSION_REVISION 5) if (CMAKE_BUILD_TYPE MATCHES Debug) set (version_suffix .Debug) @@ -53,10 +53,10 @@ include(CPack) ######## Windows -if (WIN32) - set(CMAKE_SHARED_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX}) - set(CMAKE_STATIC_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX}) -endif (WIN32) +#if (WIN32) +# set(CMAKE_SHARED_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX}) +# set(CMAKE_STATIC_LIBRARY_PREFIX ${CMAKE_INSTALL_PREFIX}) +#endif (WIN32) ######## Mac OS X @@ -68,7 +68,6 @@ set (DIR_PREFIX ${CMAKE_INSTALL_PREFIX}) set (DIR_INCLUDE ${DIR_PREFIX}/include/) set (DIR_SHARE ${DIR_PREFIX}/share/) set (DIR_ETC ${DIR_PREFIX}/etc/) -set (LIB_SUFFIX "") set (DIR_LIBRARY ${DIR_PREFIX}/lib${LIB_SUFFIX}/) if (DEFINED SHARE_INSTALL_PREFIX) @@ -83,8 +82,12 @@ if (DEFINED SYSCONF_INSTALL_DIR) set (DIR_ETC ${SYSCONF_INSTALL_DIR}) endif (DEFINED SYSCONF_INSTALL_DIR) -set (DIR_SHARE_OPENCC ${DIR_SHARE}opencc/) -set (DIR_SHARE_LOCALE ${DIR_SHARE}locale/) +if (DEFINED LIB_INSTALL_DIR) + set (DIR_LIBRARY ${LIB_INSTALL_DIR}) +endif (DEFINED LIB_INSTALL_DIR) + +set (DIR_SHARE_OPENCC ${DIR_SHARE}/opencc/) +set (DIR_SHARE_LOCALE ${DIR_SHARE}/locale/) ######## Configuration @@ -130,7 +133,7 @@ elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") endif () elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") add_definitions( - /Wall + /W4 /D "_CRT_SECURE_NO_WARNINGS" ) endif() diff --git a/NEWS.md b/NEWS.md index 90553e4..b3c2fc6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,16 @@ # Change History of OpenCC +## Version 1.0.5 + +2017年2月6日 + +* 修正Windows下CMake和Visual Studio的問題。 +* 修正FNV Hash的32位編譯警告。 +* 增加若干臺灣常用詞彙轉換和異體字轉換。 +* 增加和修正若干轉換問題。 +* 加快Node模塊編譯速度。 +* 增加Node模塊的詞典轉換接口和Promise接口。 + ## Version 1.0.4 2016年4月1日 diff --git a/README.md b/README.md index 1ff2e91..9930340 100644 --- a/README.md +++ b/README.md @@ -1,5 +1,8 @@ # Open Chinese Convert 開放中文轉換 +[ ![Download](https://api.bintray.com/packages/byvoid/opencc/OpenCC/images/download.svg) ](https://bintray.com/byvoid/opencc/OpenCC/_latestVersion) +[![Build Status](https://travis-ci.org/BYVoid/OpenCC.svg?branch=master)](https://travis-ci.org/BYVoid/OpenCC) + ## Introduction 介紹 Open Chinese Convert (OpenCC, 開放中文轉換) is an opensource project for conversion between Traditional Chinese and Simplified Chinese, supporting character-level conversion, phrase-level conversion, variant conversion and regional idioms among Mainland China, Taiwan and Hong kong. @@ -28,7 +31,7 @@ Open Chinese Convert (OpenCC, 開放中文轉換) is an opensource project for c * [Ubuntu](https://launchpad.net/ubuntu/+source/opencc) * [Fedora](https://admin.fedoraproject.org/pkgdb/package/opencc/) * [Arch Linux](https://www.archlinux.org/packages/community/x86_64/opencc/) -* [Mac OS](https://github.com/mxcl/homebrew/blob/master/Library/Formula/opencc.rb) +* [Mac OS](https://github.com/Homebrew/homebrew-core/blob/master/Formula/opencc.rb) * [Node.js](https://npmjs.org/package/opencc) ## Download 下載 @@ -62,8 +65,6 @@ https://bintray.com/byvoid/opencc/OpenCC ## Build 編譯 -[![Build Status](https://travis-ci.org/BYVoid/OpenCC.svg?branch=master)](https://travis-ci.org/BYVoid/OpenCC) - ### Build with CMake Linux (gcc 4.6 is required): @@ -83,15 +84,15 @@ sudo make PREFIX=/usr/local install Windows MSYS: ``` -cmake .. -G "MSYS Makefiles" -DCMAKE_INSTALL_PREFIX="" -DCMAKE_BUILD_TYPE=Release -make +cmake -H. -Bbuild -G "MSYS Makefiles" -DCMAKE_INSTALL_PREFIX="path/to/install" +cmake --build build --config Release --target install ``` Windows Visual Studio (2013 or higher required): ``` -cmake .. -G "Visual Studio 12" -DCMAKE_INSTALL_PREFIX="" -DCMAKE_BUILD_TYPE=Release -make +cmake -H. -Bbuild -G"Visual Studio 12" -DCMAKE_INSTALL_PREFIX="path/to/install" +cmake --build build --config Release --target install ``` ### iOS @@ -165,5 +166,10 @@ https://github.com/BYVoid/OpenCC/blob/master/NEWS.md * [Flandre Scarlet](https://github.com/XadillaX) * [宋辰文](https://github.com/songchenwen) * [iwater](https://github.com/iwater) +* [Xpol Wan](https://github.com/xpol) +* [Weihang Lo](https://github.com/weihanglo) +* [Cychih](https://github.com/pi314) +* [kyleskimo](https://github.com/kyleskimo) +* [Ryuan Choi](https://github.com/bunhere) Please update this list you have contributed OpenCC. diff --git a/binding.gyp b/binding.gyp index a86dcb8..89ae598 100644 --- a/binding.gyp +++ b/binding.gyp @@ -2,7 +2,6 @@ "includes": [ "node/global.gypi", "node/configs.gypi", - "node/opencc_dict.gypi", "node/dicts.gypi", "node/node_binding.gypi", ] diff --git a/data/CMakeLists.txt b/data/CMakeLists.txt index 2deb764..1516775 100644 --- a/data/CMakeLists.txt +++ b/data/CMakeLists.txt @@ -109,6 +109,8 @@ foreach(DICT ${DICTS}) ${DICT}.ocd COMMENT "Building ${DICT}.ocd" + COMMAND + ${CMAKE_COMMAND} -E copy "$" "$" COMMAND ${OPENCC_DICT_BIN} --input ${DICT_${DICT}_INPUT} diff --git a/data/dictionary/STCharacters.txt b/data/dictionary/STCharacters.txt index b937f52..6d9ff27 100644 --- a/data/dictionary/STCharacters.txt +++ b/data/dictionary/STCharacters.txt @@ -3878,9 +3878,21 @@ 𫠒 鱆 𫠖 𩿅 𫠜 齯 +𫢸 僤 +𫮃 墠 +𫰛 娙 +𫶇 嵽 +𫷷 廞 +𫸩 彄 +𬀩 暐 𬬭 錀 𬬻 鑪 𬭊 𨧀 𬭛 𨨏 𬭳 𨭎 𬭶 𨭆 +𬶋 鮈 +𬶍 鮀 +𬶏 鮠 +𬶟 鯻 +𬸪 鷭 diff --git a/data/dictionary/STPhrases.txt b/data/dictionary/STPhrases.txt index 11be47c..28f14fd 100644 --- a/data/dictionary/STPhrases.txt +++ b/data/dictionary/STPhrases.txt @@ -938,7 +938,7 @@ 下注解 下註解 下游 下游 下游工业 下游工業 -下确界 下确界 +下确界 下確界 下种 下種 下笔千言 下筆千言 下签 下籤 @@ -5008,7 +5008,7 @@ 僵固 僵固 僵固性 僵固性 僵尸 殭屍 -僵尸网络 僵屍網絡 +僵尸网络 殭屍網絡 僵局 僵局 僵持 僵持 僵持不下 僵持不下 @@ -6696,7 +6696,7 @@ 几下 幾下 几世 幾世 几世纪 幾世紀 -几丝 几絲 +几丝 幾絲 几两 幾兩 几个 幾個 几个人 幾個人 @@ -9427,7 +9427,7 @@ 千里之行 千里之行 千里命驾 千里命駕 千里始足下 千里始足下 -千里姻缘一线牵 千裏姻緣一線牽 +千里姻缘一线牵 千里姻緣一線牽 千里寄鹅毛 千里寄鵝毛 千里搭长棚 千里搭長棚 千里犹面 千里猶面 @@ -11505,7 +11505,7 @@ 受制 受制 受制于 受制於 受制于人 受制於人 -受命于天 受命于天 +受命于天 受命於天 受困 受困 受夠了 受夠了 受尽 受盡 @@ -11517,7 +11517,7 @@ 受托者 受託者 受折磨 受折磨 受用不尽 受用不盡 -受聘于 受聘于 +受聘于 受聘於 受阻于 受阻於 受限于 受限於 受难曲 受難曲 @@ -13242,7 +13242,7 @@ 吊祭 弔祭 吊稍 吊稍 吊窗 吊窗 -吊篮 弔籃 +吊篮 吊籃 吊索 吊索 吊纸 弔紙 吊线 吊線 @@ -13254,7 +13254,7 @@ 吊脚儿事 弔腳兒事 吊腰撒跨 弔腰撒跨 吊膀子 吊膀子 -吊臂 弔臂 +吊臂 吊臂 吊衣架 吊衣架 吊袜 吊襪 吊袜带 吊襪帶 @@ -15039,7 +15039,7 @@ 哪里 哪裏 哪里买 哪裏買 哪里人 哪裏人 -哪里哪里 哪里哪里 +哪里哪里 哪裏哪裏 哪里摔倒哪里爬 哪裏摔倒哪裏爬 哭个 哭個 哭个夠 哭個夠 @@ -17134,7 +17134,7 @@ 大不里士 大不里士 大丑 大丑 大专同学 大專同學 -大专杯 大專杯 +大专杯 大專盃 大业千秋 大業千秋 大个 大個 大个儿 大個兒 @@ -18164,7 +18164,7 @@ 威布里吉 威布里吉 威廉亚历山大 威廉亞歷山大 威比苏诺 威比蘇諾 -威氏注音法 威氏註音法 +威氏注音法 威氏注音法 威里斯 威里斯 威风八面 威風八面 娇娘 嬌娘 @@ -19185,7 +19185,7 @@ 尼布甲尼撒 尼布甲尼撒 尼庵 尼庵 尼采 尼采 -尼龙布 尼龍佈 +尼龙布 尼龍布 尽世 盡世 尽义务 盡義務 尽了 盡了 @@ -20591,7 +20591,7 @@ 干片 乾片 干犯 干犯 干犯法 幹犯法 -干球温度 幹球溫度 +干球温度 乾球溫度 干甚 幹甚 干甚么 幹甚麼 干生受 乾生受 @@ -23655,7 +23655,7 @@ 战术轰炸 戰術轟炸 战栗 戰慄 战略伙伴 戰略伙伴 -战略防御倡议 戰略防御倡議 +战略防御倡议 戰略防禦倡議 战胜 戰勝 战胜国 戰勝國 战表 戰表 @@ -23706,7 +23706,7 @@ 戴姆勒克莱斯勒 戴姆勒克萊斯勒 戴希穆克 戴希穆克 戴瑞克罗 戴瑞克羅 -戴维斯杯 戴維斯杯 +戴维斯杯 戴維斯盃 戴胜益 戴勝益 戴胜通 戴勝通 戴蒙 戴蒙 @@ -24988,7 +24988,7 @@ 抵押借款 抵押借款 抵挡不了 抵擋不了 抵牾 牴牾 -抵觸 牴觸 +抵触 牴觸 抵针 抵針 抹了 抹了 抹布 抹布 @@ -27381,7 +27381,7 @@ 新艺术 新藝術 新艺综合体 新藝綜合體 新芬党 新芬黨 -新药 新葯 +新药 新藥 新莺出谷 新鶯出谷 新规范 新規範 新闻价值 新聞價值 @@ -29325,7 +29325,7 @@ 杠头 槓頭 杠子 槓子 杠杆 槓桿 -杠杆收购 杠杆收購 +杠杆收购 槓桿收購 杠杠 槓槓 杠牌 槓牌 杠着 槓着 @@ -29447,7 +29447,7 @@ 杯酒解怨 杯酒解怨 杯酒言欢 杯酒言歡 杯酒释兵权 杯酒釋兵權 -杯里 杯里 +杯里 杯裏 杯面 杯麪 杰乐米 傑樂米 杰伊汉港 傑伊漢港 @@ -30228,7 +30228,7 @@ 核儿 核兒 核冬天 核冬天 核准 覈准 -核准的 覈準的 +核准的 覈准的 核减 覈減 核出口控制 核出口控制 核力 核力 @@ -32183,11 +32183,11 @@ 注释 註釋 注重 注重 注销 註銷 -注音 註音 +注音 注音 注音一式 注音一式 注音字母 注音字母 注音文 注音文 -注音法 註音法 +注音法 注音法 注音符号 注音符號 泪出痛肠 淚出痛腸 泪如泉涌 淚如泉涌 @@ -36025,7 +36025,7 @@ 石油输出国家组织 石油輸出國家組織 石油输出国组织 石油輸出國組織 石灰岩 石灰岩 -石灰岩洞 石灰巖洞 +石灰岩洞 石灰岩洞 石炭系 石炭系 石版术 石版術 石百合 石百合 @@ -36708,7 +36708,7 @@ 种地 種地 种姓 種姓 种姓制 種姓制 -种姓制度 種姓製度 +种姓制度 種姓制度 种子 種子 种子园 種子園 种子地 種子地 @@ -37441,7 +37441,7 @@ 签证 簽證 签证费 簽證費 签诗 籤詩 -签语饼 簽語餅 +签语饼 籤語餅 签赌 簽賭 签赌案 簽賭案 签赌站 簽賭站 @@ -38298,7 +38298,7 @@ 给于 給於 给价 給價 给出 給出 -给我干脆 給我干脆 +给我干脆 給我乾脆 给药 給藥 绚丽多彩 絢麗多彩 绚烂归于平淡 絢爛歸於平淡 @@ -39948,7 +39948,7 @@ 致理技术学院 致理技術學院 致用 致用 致电 致電 -致畸 緻畸 +致畸 致畸 致疑 致疑 致病 致病 致病性 致病性 @@ -42598,8 +42598,8 @@ 计穷虑极 計窮慮極 计算出 計算出 计算出来 計算出來 -计算机制图 計算機制圖 -计算机集成制造 計算機集成制造 +计算机制图 計算機製圖 +计算机集成制造 計算機集成製造 计量制 計量制 订个 訂個 订了 訂了 @@ -43763,7 +43763,7 @@ 足于 足於 足协杯 足協盃 足坛 足壇 -足总杯 足總杯 +足总杯 足總盃 足食丰衣 足食豐衣 趸售物价 躉售物價 趸当 躉當 @@ -46630,7 +46630,7 @@ 阿扎伦卡 阿紮倫卡 阿扎尼亚 阿扎尼亞 阿托品 阿托品 -阿拉伯共同市场 阿拉伯共衕市場 +阿拉伯共同市场 阿拉伯共同市場 阿拉伯联合大公国 阿拉伯聯合大公國 阿拉伯联合酋长国 阿拉伯聯合酋長國 阿拉克 阿拉克 @@ -47704,7 +47704,7 @@ 须发文 須發文 须发皆白 鬚髮皆白 须发表 須發表 -须后水 須後水 +须后水 鬚後水 须子 鬚子 须将有日思无日 須將有日思無日 须弥 須彌 @@ -48132,7 +48132,7 @@ 香熏疗法 香薰療法 香皂 香皂 香菜叶 香菜葉 -香蜡 香 +香蜡 香蠟 香蜡店 香蠟店 香蜡纸马 香蠟紙馬 香蜡铺 香蠟鋪 diff --git a/data/dictionary/TWPhrasesOther.txt b/data/dictionary/TWPhrasesOther.txt index 2753a7d..090a9a0 100644 --- a/data/dictionary/TWPhrasesOther.txt +++ b/data/dictionary/TWPhrasesOther.txt @@ -7,6 +7,7 @@ 涼菜 冷盤 砹 砈 硅 矽 +納米 奈米 詞組 片語 蹦極 笨豬跳 輔音 子音 diff --git a/data/dictionary/TWVariants.txt b/data/dictionary/TWVariants.txt index 7cd8ece..ce6ba24 100644 --- a/data/dictionary/TWVariants.txt +++ b/data/dictionary/TWVariants.txt @@ -5,8 +5,10 @@ 嬀 媯 峯 峰 幺 么 +擡 抬 曬 晒 棱 稜 +檐 簷 污 汙 泄 洩 涌 湧 @@ -20,6 +22,7 @@ 睾 睪 竈 灶 糉 粽 +繮 韁 纔 才 羣 群 蔿 蒍 @@ -27,5 +30,7 @@ 裏 裡 覈 核 踊 踴 +鉢 缽 鮎 鯰 麪 麵 +齶 顎 diff --git a/data/scheme/st_multi.txt b/data/scheme/st_multi.txt index 4b8a0f4..188f0ee 100644 --- a/data/scheme/st_multi.txt +++ b/data/scheme/st_multi.txt @@ -48,7 +48,7 @@ 云 雲 云 「云」意義爲「說」,其餘用「雲」。 人云亦云 雲霧 仆 僕 仆 「仆」意義爲「跌倒」,讀音pu1,「僕」爲「供人使喚的人」,讀音pu2。 前仆後繼 仆街 奴僕 公僕 風塵僕僕 舍 舍 捨 「捨」讀作she3,用於「放棄」意義,其餘用「舍」,讀作she4,古文亦同「捨」。 宿舍 村舍 退避三舍 捨弃 舍我其誰 不舍晝夜 -签 籖 簽 「簽」用於動詞,表示「題字題名」,其餘用「籤」。 簽名 簽證 標籤 書籤 牙籤 +签 籤 簽 「簽」用於動詞,表示「題字題名」,其餘用「籤」。 簽名 簽證 標籤 書籤 牙籤 折 折 摺 與「叠」有關用「摺」,與「斷」有關用「折」。 摺紙 摺扇 存摺 折斷 折腰 折服 打折 損兵折將 谷 谷 穀 表示「兩山之間」的地域用「谷」,表示農作物時用「穀」。 山谷 稻穀 几 幾 几 「几」只用作「茶几」。表示「幾乎」、「幾個」意義用「幾」。 茶几 幾乎 幾個 diff --git a/doc/CMakeLists.txt b/doc/CMakeLists.txt index 9213404..c16e289 100644 --- a/doc/CMakeLists.txt +++ b/doc/CMakeLists.txt @@ -29,7 +29,7 @@ if(BUILD_DOCUMENTATION) DIRECTORY ${CMAKE_BINARY_DIR}/doc/html DESTINATION - ${DIR_SHARE_OPENCC}doc + ${DIR_SHARE_OPENCC}/doc ) set_directory_properties( diff --git a/node/binding.cc b/node/binding.cc index 7ae6bfa..b948e55 100644 --- a/node/binding.cc +++ b/node/binding.cc @@ -3,6 +3,23 @@ #include "Config.hpp" #include "Converter.hpp" +#include "DictConverter.hpp" + +// For faster build +#include "BinaryDict.cpp" +#include "Config.cpp" +#include "Conversion.cpp" +#include "ConversionChain.cpp" +#include "Converter.cpp" +#include "DartsDict.cpp" +#include "Dict.cpp" +#include "DictConverter.cpp" +#include "DictEntry.cpp" +#include "DictGroup.cpp" +#include "MaxMatchSegmentation.cpp" +#include "Segmentation.cpp" +#include "TextDict.cpp" +#include "UTF8Util.cpp" using namespace opencc; @@ -38,12 +55,16 @@ class OpenccBinding : public Nan::ObjectWrap { return converter_->Convert(input); } + static NAN_METHOD(Version) { + info.GetReturnValue().Set(Nan::New(VERSION).ToLocalChecked()); + } + static NAN_METHOD(New) { OpenccBinding* instance; try { if (info.Length() >= 1 && info[0]->IsString()) { - string configFile = ToUtf8String(info[0]); + const string configFile = ToUtf8String(info[0]); instance = new OpenccBinding(configFile); } else { instance = new OpenccBinding("s2t.json"); @@ -111,7 +132,7 @@ class OpenccBinding : public Nan::ObjectWrap { OpenccBinding* instance = Nan::ObjectWrap::Unwrap(info.This()); - string input = ToUtf8String(info[0]); + const string input = ToUtf8String(info[0]); string output; try { output = instance->Convert(input); @@ -124,11 +145,31 @@ class OpenccBinding : public Nan::ObjectWrap { info.GetReturnValue().Set(converted); } + static NAN_METHOD(GenerateDict) { + if (info.Length() < 4 || !info[0]->IsString() || !info[1]->IsString() + || !info[2]->IsString() || !info[3]->IsString()) { + Nan::ThrowTypeError("Wrong arguments"); + return; + } + const string inputFileName = ToUtf8String(info[0]); + const string outputFileName = ToUtf8String(info[1]); + const string formatFrom = ToUtf8String(info[2]); + const string formatTo = ToUtf8String(info[3]); + try { + opencc::ConvertDictionary(inputFileName, outputFileName, formatFrom, formatTo); + } catch (opencc::Exception& e) { + Nan::ThrowError(e.what()); + } + } + static NAN_MODULE_INIT(Init) { // Prepare constructor template v8::Local tpl = Nan::New(OpenccBinding::New); tpl->SetClassName(Nan::New("Opencc").ToLocalChecked()); tpl->InstanceTemplate()->SetInternalFieldCount(1); + // Methods + Nan::SetMethod(tpl, "version", Version); + Nan::SetMethod(tpl, "generateDict", GenerateDict); // Prototype Nan::SetPrototypeMethod(tpl, "convert", Convert); Nan::SetPrototypeMethod(tpl, "convertSync", ConvertSync); diff --git a/node/demo.js b/node/demo.js index eba2da5..c3890aa 100644 --- a/node/demo.js +++ b/node/demo.js @@ -26,16 +26,23 @@ */ // In your project you should replace './opencc' with 'opencc' -var OpenCC = require('./opencc'); +const OpenCC = require('./opencc'); + +console.log('OpenCC version', OpenCC.version); // Load the default Simplified to Traditional config -var opencc = new OpenCC('s2t.json'); +const opencc = new OpenCC('s2t.json'); // Sync API -var converted = opencc.convertSync("汉字"); +const converted = opencc.convertSync("汉字"); console.log(converted); // Async API -opencc.convert("汉字", function (err, converted) { +opencc.convert("汉字", (err, converted) => { + console.log(err, converted); +}); + +// Async API with Promise +opencc.convertPromise("汉字").then(converted => { console.log(converted); }); diff --git a/node/dict.js b/node/dict.js new file mode 100644 index 0000000..59f471f --- /dev/null +++ b/node/dict.js @@ -0,0 +1,6 @@ +const OpenCC = require('./opencc'); + +const input = process.argv[2]; +const output = process.argv[3]; + +OpenCC.generateDict(input, output, "text", "ocd"); diff --git a/node/dicts.gypi b/node/dicts.gypi index 9cf4d06..3b96381 100644 --- a/node/dicts.gypi +++ b/node/dicts.gypi @@ -3,7 +3,7 @@ "target_name": "dicts", "type": "none", "variables": { - "cmd": "<(PRODUCT_DIR)/opencc_dict", + "cmd": "<(module_root_dir)/node/dict.js", "dict_merge": "<(module_root_dir)/data/scripts/merge.py", "dict_reverse": "<(module_root_dir)/data/scripts/reverse.py", "input_prefix": "<(module_root_dir)/data/dictionary/", @@ -14,57 +14,57 @@ "variables": { "input": "<(input_prefix)STCharacters.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)STCharacters.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "STPhrases", "variables": { "input": "<(input_prefix)STPhrases.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)STPhrases.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "TSCharacters", "variables": { "input": "<(input_prefix)TSCharacters.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TSCharacters.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "TSPhrases", "variables": { "input": "<(input_prefix)TSPhrases.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TSPhrases.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "TWVariants", "variables": { "input": "<(input_prefix)TWVariants.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TWVariants.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "TWVariantsRevPhrases", "variables": { "input": "<(input_prefix)TWVariantsRevPhrases.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TWVariantsRevPhrases.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "JPVariants", "variables": { "input": "<(input_prefix)JPVariants.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)JPVariants.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "TWPhrases.txt", "inputs": ["<(cmd)"], @@ -75,7 +75,7 @@ "variables": { "input": "<(input_prefix)TWVariants.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TWVariantsRev.txt"], "action": ["python", "<(dict_reverse)", "<(input)", "<@(_outputs)"] }, { @@ -83,7 +83,7 @@ "variables": { "input": "<(output_prefix)TWPhrases.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TWPhrasesRev.txt"], "action": ["python", "<(dict_reverse)", "<(input)", "<@(_outputs)"] }, { @@ -91,55 +91,55 @@ "variables": { "input": "<(output_prefix)TWPhrases.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TWPhrases.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "TWVariantsRev", "variables": { "input": "<(output_prefix)TWVariantsRev.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TWVariantsRev.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "TWPhrasesRev", "variables": { "input": "<(output_prefix)TWPhrasesRev.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)TWPhrasesRev.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "HKVariants", "variables": { "input": "<(input_prefix)HKVariants.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)HKVariants.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "HKVariantsPhrases", "variables": { "input": "<(input_prefix)HKVariantsPhrases.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)HKVariantsPhrases.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "HKVariantsRevPhrases", "variables": { "input": "<(input_prefix)HKVariantsRevPhrases.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)HKVariantsRevPhrases.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }, { "action_name": "HKVariantsRev.txt", "variables": { "input": "<(input_prefix)HKVariants.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)HKVariantsRev.txt"], "action": ["python", "<(dict_reverse)", "<(input)", "<@(_outputs)"] }, { @@ -147,12 +147,12 @@ "variables": { "input": "<(output_prefix)HKVariantsRev.txt", }, - "inputs": ["<(cmd)", "<(input)"], + "inputs": ["<(input)"], "outputs": ["<(output_prefix)HKVariantsRev.ocd"], - "action": ["<(cmd)", "-i", "<(input)", "-o", "<@(_outputs)", "--from text", "--to ocd"] + "action": ["node", "<(cmd)", "<(input)", "<@(_outputs)"] }], "dependencies": [ - "opencc_dict" + "binding" ] }] } diff --git a/node/global.gypi b/node/global.gypi index 433f1b7..5becc15 100644 --- a/node/global.gypi +++ b/node/global.gypi @@ -1,6 +1,6 @@ { "variables": { - "opencc_version": "1.0.4" + "opencc_version": "1.0.5" }, "target_defaults": { "defines": [ diff --git a/node/node_binding.gypi b/node/node_binding.gypi index 1bb5c6b..6022195 100644 --- a/node/node_binding.gypi +++ b/node/node_binding.gypi @@ -3,19 +3,6 @@ "target_name": "binding", "sources": [ "../node/binding.cc", - "../src/BinaryDict.cpp", - "../src/Config.cpp", - "../src/Conversion.cpp", - "../src/ConversionChain.cpp", - "../src/Converter.cpp", - "../src/DartsDict.cpp", - "../src/Dict.cpp", - "../src/DictEntry.cpp", - "../src/DictGroup.cpp", - "../src/MaxMatchSegmentation.cpp", - "../src/Segmentation.cpp", - "../src/TextDict.cpp", - "../src/UTF8Util.cpp", ], "include_dirs": [ "../src", diff --git a/node/opencc.js b/node/opencc.js index 4604b9b..70f9564 100644 --- a/node/opencc.js +++ b/node/opencc.js @@ -54,6 +54,33 @@ var OpenCC = module.exports = function (config) { this.handler = new binding.Opencc(config); }; +/** + * The version of OpenCC library. + * + * @fn OpenCC.version + * @memberof OpenCC + * @ingroup node_api + */ +OpenCC.version = binding.Opencc.version(); + +/** + * Generates dictionary from another format. + * + * @fn string generateDict(string inputFileName, string outputFileName, string formatFrom, string formatTo) + * @memberof OpenCC + * @param inputFileName Input dictionary filename. + * @param outputFileName Output dictionary filename. + * @param formatFrom Input dictionary format. + * @param formatTo Input dictionary format. + * @return Converted text. + * @ingroup node_api + */ +OpenCC.generateDict = function(inputFileName, outputFileName, + formatFrom, formatTo) { + return binding.Opencc.generateDict(inputFileName, outputFileName, + formatFrom, formatTo); +} + /** * Converts input text. * @@ -79,3 +106,22 @@ OpenCC.prototype.convert = function (input, callback) { OpenCC.prototype.convertSync = function (input) { return this.handler.convertSync(input.toString()); }; + +/** + * Converts input text asynchronously and returns a Promise. + * + * @fn Promise convertPromise(string input) + * @memberof OpenCC + * @param input Input text. + * @return The Promise that will yield the converted text. + * @ingroup node_api + */ +OpenCC.prototype.convertPromise = function (input) { + const self = this; + return new Promise(function(resolve, reject) { + self.handler.convert(input.toString(), function(err, text) { + if (err) reject(err); + else resolve(text); + }); + }); +}; diff --git a/node/opencc_dict.gypi b/node/opencc_dict.gypi deleted file mode 100644 index 879f871..0000000 --- a/node/opencc_dict.gypi +++ /dev/null @@ -1,21 +0,0 @@ -{ - "targets": [{ - "target_name": "opencc_dict", - "type": "executable", - "sources": [ - "../src/BinaryDict.cpp", - "../src/DartsDict.cpp", - "../src/Dict.cpp", - "../src/DictEntry.cpp", - "../src/DictGroup.cpp", - "../src/TextDict.cpp", - "../src/UTF8Util.cpp", - "../src/tools/DictConverter.cpp", - ], - "include_dirs": [ - "../src", - "../deps/darts-clone", - "../deps/tclap-1.2.1" - ] - }] -} diff --git a/package.json b/package.json index f545798..4d7233b 100644 --- a/package.json +++ b/package.json @@ -1,9 +1,9 @@ { "name": "opencc", - "version": "1.0.4", + "version": "1.0.5", "description": "Conversion between Traditional and Simplified Chinese", "author": "BYVoid ", - "license": "Apache", + "license": "Apache-2.0", "main": "node/opencc.js", "scripts": { "test": "mocha -R spec node/test.js" @@ -27,6 +27,6 @@ "mocha": "2.2.5" }, "dependencies": { - "nan": "^2.2.0" + "nan": "^2.5.1" } } diff --git a/src/BinaryDict.cpp b/src/BinaryDict.cpp index 9354fc8..87a215d 100644 --- a/src/BinaryDict.cpp +++ b/src/BinaryDict.cpp @@ -30,10 +30,10 @@ size_t BinaryDict::KeyMaxLength() const { } void BinaryDict::SerializeToFile(FILE* fp) const { - string keyBuffer, valueBuffer; + string keyBuf, valueBuf; vector keyOffsets, valueOffsets; size_t keyTotalLength = 0, valueTotalLength = 0; - ConstructBuffer(keyBuffer, keyOffsets, keyTotalLength, valueBuffer, + ConstructBuffer(keyBuf, keyOffsets, keyTotalLength, valueBuf, valueOffsets, valueTotalLength); // Number of items size_t numItems = lexicon->Length(); @@ -41,9 +41,9 @@ void BinaryDict::SerializeToFile(FILE* fp) const { // Data fwrite(&keyTotalLength, sizeof(size_t), 1, fp); - fwrite(keyBuffer.c_str(), sizeof(char), keyTotalLength, fp); + fwrite(keyBuf.c_str(), sizeof(char), keyTotalLength, fp); fwrite(&valueTotalLength, sizeof(size_t), 1, fp); - fwrite(valueBuffer.c_str(), sizeof(char), valueTotalLength, fp); + fwrite(valueBuf.c_str(), sizeof(char), valueTotalLength, fp); size_t keyCursor = 0, valueCursor = 0; for (const DictEntry* entry : *lexicon) { @@ -131,8 +131,8 @@ BinaryDictPtr BinaryDict::NewFromFile(FILE* fp) { return dict; } -void BinaryDict::ConstructBuffer(string& keyBuffer, vector& keyOffset, - size_t& keyTotalLength, string& valueBuffer, +void BinaryDict::ConstructBuffer(string& keyBuf, vector& keyOffset, + size_t& keyTotalLength, string& valueBuf, vector& valueOffset, size_t& valueTotalLength) const { keyTotalLength = 0; @@ -152,28 +152,28 @@ void BinaryDict::ConstructBuffer(string& keyBuffer, vector& keyOffset, } } // Write keys and values to buffers - keyBuffer.resize(keyTotalLength, '\0'); - valueBuffer.resize(valueTotalLength, '\0'); - char* pKeyBuffer = const_cast(keyBuffer.c_str()); - char* pValueBuffer = const_cast(valueBuffer.c_str()); + keyBuf.resize(keyTotalLength, '\0'); + valueBuf.resize(valueTotalLength, '\0'); + char* pKeyBuffer = const_cast(keyBuf.c_str()); + char* pValueBuffer = const_cast(valueBuf.c_str()); for (const DictEntry* entry : *lexicon) { strcpy(pKeyBuffer, entry->Key()); - keyOffset.push_back(pKeyBuffer - keyBuffer.c_str()); + keyOffset.push_back(pKeyBuffer - keyBuf.c_str()); pKeyBuffer += entry->KeyLength() + 1; if (entry->NumValues() == 1) { const auto* svEntry = static_cast(entry); strcpy(pValueBuffer, svEntry->Value()); - valueOffset.push_back(pValueBuffer - valueBuffer.c_str()); + valueOffset.push_back(pValueBuffer - valueBuf.c_str()); pValueBuffer += strlen(svEntry->Value()) + 1; } else { const auto* mvEntry = static_cast(entry); for (const auto& value : mvEntry->Values()) { strcpy(pValueBuffer, value); - valueOffset.push_back(pValueBuffer - valueBuffer.c_str()); + valueOffset.push_back(pValueBuffer - valueBuf.c_str()); pValueBuffer += strlen(value) + 1; } } } - assert(keyBuffer.c_str() + keyTotalLength == pKeyBuffer); - assert(valueBuffer.c_str() + valueTotalLength == pValueBuffer); + assert(keyBuf.c_str() + keyTotalLength == pKeyBuffer); + assert(valueBuf.c_str() + valueTotalLength == pValueBuffer); } diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 68883a2..498a4f6 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -16,6 +16,7 @@ set( Converter.hpp DartsDict.hpp Dict.hpp + DictConverter.hpp DictEntry.hpp DictGroup.hpp Exception.hpp @@ -43,6 +44,7 @@ set( Converter.cpp DartsDict.cpp Dict.cpp + DictConverter.cpp DictEntry.cpp DictGroup.cpp MaxMatchSegmentation.cpp @@ -54,7 +56,8 @@ set( UTF8Util.cpp ) -add_library(libopencc ${LIBOPENCC_SOURCES}) +add_library(libopencc ${LIBOPENCC_SOURCES} ${LIBOPENCC_HEADERS}) +source_group(libopencc FILES ${LIBOPENCC_SOURCES} ${LIBOPENCC_HEADERS}) GENERATE_EXPORT_HEADER( libopencc diff --git a/src/Config.cpp b/src/Config.cpp index 35bc89b..f302585 100644 --- a/src/Config.cpp +++ b/src/Config.cpp @@ -94,7 +94,7 @@ public: DictPtr ParseDict(const JSONValue& doc) { // Required: type string type = GetStringProperty(doc, "type"); - DictPtr dict; + if (type == "group") { list dicts; const JSONValue& docs = GetArrayProperty(doc, "dicts"); @@ -114,6 +114,7 @@ public: if (cache != nullptr) { return cache; } + DictPtr dict; if (type == "text") { dict = LoadDictWithPaths(fileName); } else if (type == "ocd") { @@ -231,7 +232,10 @@ ConverterPtr Config::NewFromString(const string& json, } ConfigInternal* impl = (ConfigInternal*)internal; - impl->configDirectory = configDirectory; + if (configDirectory.back() == '/' || configDirectory.back() == '\\') + impl->configDirectory = configDirectory; + else + impl->configDirectory = configDirectory + '/'; // Required: segmentation SegmentationPtr segmentation = diff --git a/src/ConfigTest.cpp b/src/ConfigTest.cpp index 080cccf..6285f7a 100644 --- a/src/ConfigTest.cpp +++ b/src/ConfigTest.cpp @@ -57,4 +57,14 @@ TEST_F(ConfigTest, NonexistingPath) { } } +TEST_F(ConfigTest, NewFromStringWitoutTrailingSlash) { + std::ifstream ifs(CONFIG_TEST_PATH); + string content(std::istreambuf_iterator(ifs), + (std::istreambuf_iterator())); + string pathWithoutTrailingSlash = CMAKE_SOURCE_DIR "/test/config_test"; + + const ConverterPtr converter = config.NewFromString( + content, pathWithoutTrailingSlash); +} + } // namespace opencc diff --git a/src/DartsDict.cpp b/src/DartsDict.cpp index cfcc2c4..75f4c3d 100644 --- a/src/DartsDict.cpp +++ b/src/DartsDict.cpp @@ -154,7 +154,6 @@ void DartsDict::SerializeToFile(FILE* fp) const { fwrite(&dartsSize, sizeof(size_t), 1, fp); fwrite(dict.array(), sizeof(char), dartsSize, fp); - auto internal = this->internal; internal->binary.reset(new BinaryDict(lexicon)); internal->binary->SerializeToFile(fp); } diff --git a/src/Dict.cpp b/src/Dict.cpp index 755b67c..0e3f56d 100644 --- a/src/Dict.cpp +++ b/src/Dict.cpp @@ -30,7 +30,7 @@ Optional Dict::MatchPrefix(const char* word) const { if (!result.IsNull()) { return result; } - len -= UTF8Util::PrevCharLength(wordTruncPtr); + len -= static_cast(UTF8Util::PrevCharLength(wordTruncPtr)); } return Optional::Null(); } @@ -40,7 +40,7 @@ vector Dict::MatchAllPrefixes(const char* word) const { string wordTrunc = UTF8Util::TruncateUTF8(word, KeyMaxLength()); const char* wordTruncPtr = wordTrunc.c_str() + wordTrunc.length(); for (long len = static_cast(wordTrunc.length()); len > 0; - len -= UTF8Util::PrevCharLength(wordTruncPtr)) { + len -= static_cast(UTF8Util::PrevCharLength(wordTruncPtr))) { wordTrunc.resize(static_cast(len)); wordTruncPtr = wordTrunc.c_str() + len; const Optional& result = Match(wordTrunc.c_str()); diff --git a/src/DictConverter.cpp b/src/DictConverter.cpp new file mode 100644 index 0000000..06f37c1 --- /dev/null +++ b/src/DictConverter.cpp @@ -0,0 +1,57 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2017 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "DartsDict.hpp" +#include "DictConverter.hpp" +#include "TextDict.hpp" + +using namespace opencc; + +DictPtr LoadDictionary(const string& format, const string& inputFileName) { + if (format == "text") { + return SerializableDict::NewFromFile(inputFileName); + } else if (format == "ocd") { + return SerializableDict::NewFromFile(inputFileName); + } else { + fprintf(stderr, "Unknown dictionary format: %s\n", format.c_str()); + exit(2); + } + return nullptr; +} + +SerializableDictPtr ConvertDict(const string& format, + const DictPtr dict) { + if (format == "text") { + return TextDict::NewFromDict(*dict.get()); + } else if (format == "ocd") { + return DartsDict::NewFromDict(*dict.get()); + } else { + fprintf(stderr, "Unknown dictionary format: %s\n", format.c_str()); + exit(2); + } + return nullptr; +} + +namespace opencc { +void ConvertDictionary(const string inputFileName, const string outputFileName, + const string formatFrom, const string formatTo) { + DictPtr dictFrom = LoadDictionary(formatFrom, inputFileName); + SerializableDictPtr dictTo = ConvertDict(formatTo, dictFrom); + dictTo->SerializeToFile(outputFileName); +} +} diff --git a/src/DictConverter.hpp b/src/DictConverter.hpp new file mode 100644 index 0000000..f59c5ec --- /dev/null +++ b/src/DictConverter.hpp @@ -0,0 +1,30 @@ +/* + * Open Chinese Convert + * + * Copyright 2010-2017 BYVoid + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "Common.hpp" + +namespace opencc { +/** +* Converts a dictionary from a format to another. +* @ingroup opencc_cpp_api +*/ +void ConvertDictionary(const string inputFileName, const string outputFileName, + const string formatFrom, const string formatTo); +} diff --git a/src/DictEntry.hpp b/src/DictEntry.hpp index ecef489..0a840a6 100644 --- a/src/DictEntry.hpp +++ b/src/DictEntry.hpp @@ -140,11 +140,11 @@ public: size_t NumValues() const { return values.size(); } vector Values() const { - vector values; + vector retsult; for (const string& value : this->values) { - values.push_back(value.c_str()); + retsult.push_back(value.c_str()); } - return values; + return retsult; } private: diff --git a/src/Exception.hpp b/src/Exception.hpp index a1134e8..33cb5d9 100644 --- a/src/Exception.hpp +++ b/src/Exception.hpp @@ -32,7 +32,7 @@ namespace opencc { -class OPENCC_EXPORT Exception : public std::exception { +class OPENCC_EXPORT Exception { public: Exception() {} diff --git a/src/PhraseExtract.cpp b/src/PhraseExtract.cpp index 17b46d4..495ab31 100644 --- a/src/PhraseExtract.cpp +++ b/src/PhraseExtract.cpp @@ -1,4 +1,4 @@ -/* +/* * Open Chinese Convert * * Copyright 2015 BYVoid @@ -171,8 +171,9 @@ void PhraseExtract::ExtractSuffixes() { std::min(static_cast(wordMaxLength + suffixSetLength), text.UTF8Length()); const UTF8StringSlice& slice = text.Left(suffixLength); - suffixes.push_back(UTF8StringSlice8Bit(slice.CString(), slice.UTF8Length(), - slice.ByteLength())); + suffixes.push_back(UTF8StringSlice8Bit(slice.CString(), + static_cast(slice.UTF8Length()), + static_cast(slice.ByteLength()))); } suffixes.shrink_to_fit(); // Sort suffixes @@ -189,8 +190,10 @@ void PhraseExtract::ExtractPrefixes() { std::min(static_cast(wordMaxLength + prefixSetLength), text.UTF8Length()); const UTF8StringSlice& slice = text.Right(prefixLength); - prefixes.push_back(UTF8StringSlice8Bit(slice.CString(), slice.UTF8Length(), - slice.ByteLength())); + prefixes.push_back(UTF8StringSlice8Bit(slice.CString(), + static_cast(slice.UTF8Length()), + static_cast(slice.ByteLength()))); + } prefixes.shrink_to_fit(); // Sort suffixes reversely @@ -206,7 +209,7 @@ void PhraseExtract::CalculateFrequency() { ExtractSuffixes(); } for (const auto& suffix : suffixes) { - for (size_t i = 1; i <= suffix.UTF8Length() && i <= wordMaxLength; i++) { + for (UTF8StringSlice8Bit::LengthType i = 1; i <= suffix.UTF8Length() && i <= wordMaxLength; i++) { const UTF8StringSlice8Bit wordCandidate = suffix.Left(i); signals->AddKey(wordCandidate).frequency++; totalOccurrence++; @@ -263,6 +266,7 @@ void CalculatePrefixSuffixEntropy( const std::function& updateEntropy) { AdjacentSetType adjacentSet; + auto setLength8Bit = static_cast(setLength); for (PhraseExtract::LengthType length = wordMinLength; length <= wordMaxLength; length++) { adjacentSet.clear(); @@ -271,19 +275,20 @@ void CalculatePrefixSuffixEntropy( if (presuffix.UTF8Length() < length) { continue; } + auto length8Bit = static_cast(length); const auto& wordCandidate = - SUFFIX ? presuffix.Left(length) : presuffix.Right(length); + SUFFIX ? presuffix.Left(length8Bit) : presuffix.Right(length8Bit); if (wordCandidate != lastWord) { updateEntropy(lastWord, adjacentSet); lastWord = wordCandidate; } if (length + setLength <= presuffix.UTF8Length()) { if (SUFFIX) { - const auto& wordSuffix = presuffix.SubString(length, setLength); + const auto& wordSuffix = presuffix.SubString(length8Bit, setLength8Bit); adjacentSet[wordSuffix]++; } else { const auto& wordPrefix = presuffix.SubString( - presuffix.UTF8Length() - length - setLength, setLength); + presuffix.UTF8Length() - length8Bit - setLength8Bit, setLength8Bit); adjacentSet[wordPrefix]++; } } @@ -393,7 +398,7 @@ double PhraseExtract::CalculateCohesion( const UTF8StringSlice8Bit& wordCandidate) const { // TODO Try average value double minPMI = INFINITY; - for (LengthType leftLength = 1; leftLength <= wordCandidate.UTF8Length() - 1; + for (UTF8StringSlice8Bit::LengthType leftLength = 1; leftLength <= wordCandidate.UTF8Length() - 1; leftLength++) { const auto& leftPart = wordCandidate.Left(leftLength); const auto& rightPart = diff --git a/src/PhraseExtract.hpp b/src/PhraseExtract.hpp index 4b11bd8..5397c86 100644 --- a/src/PhraseExtract.hpp +++ b/src/PhraseExtract.hpp @@ -25,7 +25,7 @@ namespace opencc { -class PhraseExtract { +class OPENCC_EXPORT PhraseExtract { public: typedef UTF8StringSlice::LengthType LengthType; diff --git a/src/SimpleConverter.cpp b/src/SimpleConverter.cpp index b3dfb60..a8e2ff6 100644 --- a/src/SimpleConverter.cpp +++ b/src/SimpleConverter.cpp @@ -133,14 +133,9 @@ opencc_t opencc_open(const char* configFileName) { #endif int opencc_close(opencc_t opencc) { - try { - SimpleConverter* instance = reinterpret_cast(opencc); - delete instance; - return 0; - } catch (std::exception& ex) { - cError = ex.what(); - return 1; - } + SimpleConverter* instance = reinterpret_cast(opencc); + delete instance; + return 0; } size_t opencc_convert_utf8_to_buffer(opencc_t opencc, const char* input, diff --git a/src/UTF8StringSlice.hpp b/src/UTF8StringSlice.hpp index 1a18b4b..9600631 100644 --- a/src/UTF8StringSlice.hpp +++ b/src/UTF8StringSlice.hpp @@ -40,10 +40,12 @@ inline size_t FNVHash<4>(const char* text, const size_t byteLength) { return FNVHash(text, byteLength, 16777619UL, 2166136261UL); } +#if SIZE_MAX == 0xffffffffffffffff template <> inline size_t FNVHash<8>(const char* text, const size_t byteLength) { return FNVHash(text, byteLength, 1099511628211UL, 14695981039346656037UL); } +#endif } // namespace internal @@ -52,8 +54,8 @@ public: typedef LENGTH_TYPE LengthType; UTF8StringSliceBase(const char* _str) - : str(_str), utf8Length(UTF8Util::Length(_str)), - byteLength(strlen(_str)) {} + : str(_str), utf8Length(static_cast(UTF8Util::Length(_str))), + byteLength(static_cast(strlen(_str))) {} UTF8StringSliceBase(const char* _str, const LengthType _utf8Length) : str(_str), utf8Length(_utf8Length) { @@ -70,36 +72,36 @@ public: LengthType ByteLength() const { return byteLength; } - UTF8StringSliceBase Left(const LengthType utf8Length) const { - if (utf8Length == UTF8Length()) { + UTF8StringSliceBase Left(const LengthType numberOfCharacters) const { + if (numberOfCharacters == UTF8Length()) { return *this; } else { - return UTF8StringSliceBase(str, utf8Length); + return UTF8StringSliceBase(str, numberOfCharacters); } } - UTF8StringSliceBase Right(const LengthType utf8Length) const { - if (utf8Length == UTF8Length()) { + UTF8StringSliceBase Right(const LengthType numberOfCharacters) const { + if (numberOfCharacters == UTF8Length()) { return *this; } else { const char* pstr = str + byteLength; - for (size_t i = 0; i < utf8Length; i++) { + for (size_t i = 0; i < numberOfCharacters; i++) { pstr = UTF8Util::PrevChar(pstr); } - return UTF8StringSliceBase(pstr, utf8Length); + return UTF8StringSliceBase(pstr, numberOfCharacters); } } UTF8StringSliceBase SubString(const LengthType offset, - const LengthType utf8Length) const { + const LengthType numberOfCharacters) const { if (offset == 0) { - return Left(utf8Length); + return Left(numberOfCharacters); } else { const char* pstr = str; for (size_t i = 0; i < offset; i++) { pstr = UTF8Util::NextChar(pstr); } - return UTF8StringSliceBase(pstr, utf8Length); + return UTF8StringSliceBase(pstr, numberOfCharacters); } } @@ -223,7 +225,7 @@ private: for (size_t i = 0; i < utf8Length; i++) { pstr = UTF8Util::NextChar(pstr); } - byteLength = pstr - str; + byteLength = static_cast(pstr - str); } const char* str; diff --git a/src/UTF8Util.hpp b/src/UTF8Util.hpp index ea03dc0..70bbf83 100644 --- a/src/UTF8Util.hpp +++ b/src/UTF8Util.hpp @@ -262,20 +262,22 @@ public: #ifdef _MSC_VER static std::string U16ToU8(const std::wstring& wstr) { std::string ret; - int convcnt = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wstr.length(), NULL, 0, NULL, NULL); + int length = static_cast(wstr.length()); + int convcnt = WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, NULL, 0, NULL, NULL); if (convcnt > 0) { ret.resize(convcnt); - WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), wstr.length(), &ret[0], convcnt, NULL, NULL); + WideCharToMultiByte(CP_UTF8, 0, wstr.c_str(), length, &ret[0], convcnt, NULL, NULL); } return ret; } static std::wstring U8ToU16(const std::string& str) { std::wstring ret; - int convcnt = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), str.length(), NULL, 0); + int length = static_cast(str.length()); + int convcnt = MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, NULL, 0); if (convcnt > 0) { ret.resize(convcnt); - MultiByteToWideChar(CP_UTF8, 0, str.c_str(), str.length(), &ret[0], convcnt); + MultiByteToWideChar(CP_UTF8, 0, str.c_str(), length, &ret[0], convcnt); } return ret; } diff --git a/src/tools/DictConverter.cpp b/src/tools/DictConverter.cpp index 6383830..868e16d 100644 --- a/src/tools/DictConverter.cpp +++ b/src/tools/DictConverter.cpp @@ -17,43 +17,10 @@ */ #include "CmdLineOutput.hpp" -#include "DartsDict.hpp" -#include "TextDict.hpp" +#include "DictConverter.hpp" using namespace opencc; -DictPtr LoadDictionary(const string& format, const string& inputFileName) { - if (format == "text") { - return SerializableDict::NewFromFile(inputFileName); - } else if (format == "ocd") { - return SerializableDict::NewFromFile(inputFileName); - } else { - fprintf(stderr, "Unknown dictionary format: %s\n", format.c_str()); - exit(2); - } - return nullptr; -} - -SerializableDictPtr ConvertDictionary(const string& format, - const DictPtr dict) { - if (format == "text") { - return TextDict::NewFromDict(*dict.get()); - } else if (format == "ocd") { - return DartsDict::NewFromDict(*dict.get()); - } else { - fprintf(stderr, "Unknown dictionary format: %s\n", format.c_str()); - exit(2); - } - return nullptr; -} - -void ConvertDictionary(const string inputFileName, const string outputFileName, - const string formatFrom, const string formatTo) { - DictPtr dictFrom = LoadDictionary(formatFrom, inputFileName); - SerializableDictPtr dictTo = ConvertDictionary(formatTo, dictFrom); - dictTo->SerializeToFile(outputFileName); -} - int main(int argc, const char* argv[]) { try { TCLAP::CmdLine cmd("Open Chinese Convert (OpenCC) Dictionary Tool", ' ', -- 2.30.2